import PyPDF2
import os
import re
from collections import defaultdict
import pandas as pd
import datetime
import pdfplumber

#Run this file from the script directory

# Move the working directory to the current directory's path with its last
# 7 characters removed; the relative 'Data/...' paths used below are then
# resolved against that location.
# NOTE(review): the fixed 7-character cut presumably strips the name of the
# folder holding this script -- confirm it matches the actual folder name.
cwd = os.getcwd()
path = cwd[:-7]

os.chdir(path)


# Section headings that identify the kind of Supreme Court action.
# Titles carrying '&' are matched against the '&' markers that the
# text-cleaning step writes in place of a newline-led heading.
section_titles = [
    'Summary Disposition',
    'Orders in Pending Cases',
    'Certiorari Denied',
    '&Per Curiam',
    '&Habeas Corpus',
    '&Mandamus Denied',
    '&Prohibition Denied',
    '&Rehearings Denied',
    '&Attorney Discipline',
    '&Admission',
    '&Oral Argument',
    '&Opinion&',
    '&Certiorari Granted',
    '&Motion Denied',
    '&Jurisdiction Postponed',
    'Order in Pending Case',
]

# Two-digit term labels '93' ... '99', '00' ... '24', in chronological order
years = [str(term_year)[-2:] for term_year in range(1993, 2025)]

# Month names as they appear in upper case; the July entry is a regex
# fragment that tolerates one whitespace character inside the word.
months = [
    "JANUARY",
    "FEBRUARY",
    "MARCH",
    "APRIL",
    "MAY",
    "JUNE",
    r"JUL\s{0,1}Y",
    "AUGUST",
    "SEPTEMBER",
    "OCTOBER",
    "NOVEMBER",
    "DECEMBER",
]

# Weekday regex fragments, each tolerating one whitespace character before 'AY'
week = [
    stem + r"\s{0,1}AY"
    for stem in ("SUND", "MOND", "TUESD", "WEDNESD", "THURSD", "FRID", "SATURD")
]

# Newline-led headings whose leading newline is swapped for '&'
replace_newline = [
    '\n' + heading
    for heading in (
        'Per Curiam',
        'Habeas Corpus',
        'Mandamus Denied',
        'Prohibition Denied',
        'Rehearings Denied',
        'Attorney Discipline',
        'Oral Argument',
        'Certiorari Granted',
        'No.',
        'Admission',
        'Motion Denied',
        'Jurisdiction Postponed',
    )
]


#Split report from a given day into sections
def case_part_maker(sessions):
    """Group the text of each session by section title and session date.

    The first element of ``sessions`` is skipped. For every remaining
    session the weekday page headers and the upper-case dates are stripped
    from the text, and the date is remembered (``'NA'`` when no date
    pattern matches). The stripped text is then cut at every section title
    from ``section_titles`` that occurs in it.

    Returns a ``defaultdict(str)`` keyed by ``(section_title, date_string)``
    whose values are the concatenated text found under that title.
    """
    grouped = defaultdict(str)
    for text in sessions[1:]:
        session_date = 'NA'
        # The order of these passes is significant: each date is looked up
        # before the header/date removal of the same pass edits the text.
        for weekday in week:
            header_pattern = r'\d{1,4} ' + weekday + ',{0,1}'
            for month_name in months:
                date_pattern = month_name + ' [0-9]{1,2},{0,1} [0-9]{4}'
                hit = re.search(date_pattern, text)
                if hit is not None:
                    session_date = hit.group(0)
                text = re.sub(header_pattern, '', text)
                text = re.sub(date_pattern, '', text)

        found_titles = [title for title in section_titles if title in text]
        for title in found_titles:
            other_titles = [other for other in found_titles if other != title]
            for chunk in text.split(title)[1:]:
                # Keep only what precedes the next heading of another section
                for other in other_titles:
                    chunk = chunk.partition(other)[0]
                grouped[(title, session_date)] += chunk
    return grouped

# Build one spreadsheet row per docket entry found in each term's journal.
# `actions` maps a column name to its list of values.
actions = defaultdict(list)
date_format = "%B %d, %Y"

# Docket-number shapes, tried in this order against the first 20 characters
# of an entry; the first pattern that matches wins.
docket_patterns = (
    r'No\.\s{0,1}[0-9]{2}.+?[0-9]{1,5}',   # number starting with two digits
    r'No\.\s{0,1}[0-9]{1,4},{0,1} Orig',   # number followed by 'Orig'
    r'No\.\s{0,1}[AMSD]{1}.+?[0-9]{1,5}',  # number starting with A, M, S or D
    r'No\.–––\.',                          # literal 'No.–––.' placeholder
)

for year in years:
    name = 'Data/SC_Journals/SCJ_'+year+'.pdf'
    # Extract every page while the file is open; the context manager closes
    # the handle afterwards, even if extraction raises.
    with open(name, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfReader(pdfFileObj)
        t = ''.join(pageObj.extract_text() for pageObj in pdfReader.pages)

    #Clean the text
    t = t.replace('\xad','')   # remove soft hyphens
    t = t.replace('-\n','')    # re-join words hyphenated across a line break
    t = t.replace('  ',' ')
    # Normalise heading variants to the spellings used in section_titles
    t = t.replace('Opinion Per Curiam','\nPer Curiam')
    t = t.replace('Opinions Per Curiam','\nPer Curiam')
    t = t.replace('\nRehearing Denied','\nRehearings Denied')
    t = re.sub(r'A\s*d\s*m\s*i\s*s\s*s\s*i\s*o\s*n','Admission',t)
    t = re.sub(r'\nOpinions{0,1}\s*\n','&Opinion&\n',t)
    # Mark the known newline-led headings with '&' so they survive the
    # removal of all remaining newlines just below
    for entry in replace_newline:
        t=t.replace(entry,'&'+entry[1:])
    t = t.replace('\n','')
    t = t.replace('  ',' ')
    t=t.replace('Order in Pending Case','Orders in Pending Cases')
    # Repair words that were extracted with a stray space inside them
    sessions = re.sub('JOURN AL', 'JOURNAL',t)
    sessions = re.sub('JUL Y', 'JULY',sessions)
    sessions = re.sub('FEBR UARY', 'FEBRUARY',sessions)
    sessions = re.sub('JANU ARY', 'JANUARY',sessions)

    # Insert the missing space in dates such as 'MARCH4'. The entries of
    # `months` are applied as regular expressions because the July entry is
    # a regex fragment and would never match as a literal string. Only the
    # digits 1-9 can start a day number, so '0' is deliberately excluded.
    for month in months:
        sessions = re.sub('('+month+')([1-9])', r'\1 \2', sessions)
    sessions = sessions.split('(JOURNAL)')
    case_parts = case_part_maker(sessions)

    #Split each section into dockets
    for part in case_parts:
        # part is a (section title, date string) key produced by case_part_maker.
        # NOTE(review): no entry of section_titles equals the string tested
        # here, so with the titles defined in this file the first branch is
        # never taken -- confirm whether '&Admission' was intended.
        if part[0]=='&Admissions to the Bar':
            t = '&No.'+case_parts[part]
            temp=t.split('&No.')
        else:
            temp=case_parts[part].split('&No.')
        # Text before the first '&No.' marker belongs to no docket entry
        for entry in temp[1:]:
            s = 'No.'+entry
            # Drop leftover markers and unify the dash variants to an en dash
            s = s.replace('&','').replace('±','–').replace('– ','–').replace('-','–')
            dn = 'NA'
            for pattern in docket_patterns:
                found = re.findall(pattern, s[:20])
                if found:
                    dn = found[0]
                    break
            actions['Docket Number'].append(dn)
            actions['Term'].append(year)
            # Dates appear with or without a comma after the day; a value
            # matching neither format raises ValueError.
            try:
                parsed = datetime.datetime.strptime(part[1], date_format)
            except ValueError:
                parsed = datetime.datetime.strptime(part[1], '%B %d %Y')
            actions['Date'].append(str(parsed.date()))
            actions['Action'].append(part[0])
            actions['Text'].append(s)


actions = pd.DataFrame(data=actions)
actions.to_excel('Data/Processed_Data/Docket_Day.xlsx')


#Scrape Tables

# Collect, per term, every page table that contains the bar-admission header
# row. The header is accepted with or without spaces in 'City and State'.
potential_tables = defaultdict(list)
admission_headers = (
    ['Applicant', 'CityandState', 'Movant'],
    ['Applicant', 'City and State', 'Movant'],
)
for year in years:
    name = 'Data/SC_Journals/SCJ_'+year+'.pdf'
    with pdfplumber.open(name) as pdf:
        for page in pdf.pages:
            # Columns are fixed by explicit x-positions; rows are inferred
            # from the text lines on the page.
            table = page.extract_table(table_settings={"vertical_strategy": "explicit",
                                                       "horizontal_strategy": "text",
                                                       "explicit_vertical_lines":[150,280,380,465],
                                                       "snap_tolerance": 6,})
            # extract_table returns None when the page yields no table;
            # skip such pages instead of failing on the membership test.
            if table is None:
                continue
            if any(header in table for header in admission_headers):
                potential_tables[year].append(table)


# Turn the scraped admission tables into, per term, a mapping
# ISO date -> list of [name, location, movant] records.

_HEADER_SPACED = ['Applicant', 'City and State', 'Movant']
_HEADER_UNSPACED = ['Applicant', 'CityandState', 'Movant']
_BLANK_ROW = ['', '', '']


def _parse_date_cell(cell):
    """Reduce a cell to 'MONTH D, YYYY' and return it as an ISO date string."""
    cell = re.sub(r'.+?([A-Z]+?)\s{0,1}(\d{1,2}),{0,1}\s*(\d{4})', r'\1 \2, \3', cell)
    return str(datetime.datetime.strptime(cell, '%B %d, %Y').date())


def _table_date(table):
    """Read the date from the second cell of row 2, falling back to row 0."""
    try:
        return _parse_date_cell(table[2][1])
    except (IndexError, TypeError, ValueError):
        # Row 2 is missing, empty or not a date; a failure on row 0 propagates.
        return _parse_date_cell(table[0][1])


def _rows_until_blank(rows):
    """Return the rows that precede the first all-empty row (all rows if none)."""
    try:
        return rows[:rows.index(_BLANK_ROW)]
    except ValueError:
        return rows


def _applicant_rows(table):
    """Return the rows listed under the admission header(s) of one table."""
    header = _HEADER_SPACED if _HEADER_SPACED in table else _HEADER_UNSPACED
    # Skip the header row and the single row that follows it
    rows = table[table.index(header) + 2:]
    if header in rows and _BLANK_ROW in rows:
        # The header occurs a second time: take the rows up to the first
        # blank row, plus the rows listed under the repeated header.
        repeated = rows[rows.index(header) + 2:]
        return rows[:rows.index(_BLANK_ROW)] + _rows_until_blank(repeated)
    return _rows_until_blank(rows)


def _tidy_spacing(text):
    """Insert a space between adjacent capital-led pieces and normalise blanks."""
    text = re.sub(r'([A-Z].+?)([A-Z].+?)([A-Z].+?)([A-Z].+?)', r'\1 \2 \3 \4', text)
    text = re.sub(r'([A-Z].+?)([A-Z].+?)([A-Z].+?)', r'\1 \2 \3', text)
    text = re.sub(r'([A-Z].+?)([A-Z].+?)', r'\1 \2', text)
    text = text.replace('   ', ' ')
    text = text.replace('  ', ' ')
    text = text.replace('- ', '-')
    return text


def _finish_record(record):
    """Clean the name and movant of a merged record; location is kept as is."""
    name, location, movant = record
    return [_tidy_spacing(name), location, _tidy_spacing(movant)]


def _merge_wrapped_rows(rows):
    """Merge continuation rows into the record they belong to.

    A row with all three cells filled starts a new record; a row with any
    empty cell is appended, cell by cell, to the record being built. The
    record still being built when the rows run out is emitted as well, so
    the last applicant of a date is neither lost nor filed under another date.
    """
    merged = []
    pending = None
    for row in rows:
        cells = [row[0], row[1], row[2]]
        if '' in cells:
            if pending is None:
                pending = ['', '', '']
            pending = [held + ' ' + cell for held, cell in zip(pending, cells)]
        else:
            if pending is not None:
                merged.append(_finish_record(pending))
            pending = cells
    if pending is not None:
        merged.append(_finish_record(pending))
    return merged


admissions={}
for year in years:
    # Raw table rows of this term, grouped by admission date
    applicants = defaultdict(list)
    for table in potential_tables[year]:
        date = _table_date(table)
        applicants[date].extend(_applicant_rows(table))

    # Built once per term, after all of its tables were read, so a term
    # without any table gets an empty mapping rather than another term's data.
    cleaned_applicants = defaultdict(list)
    for date, rows in applicants.items():
        records = _merge_wrapped_rows(rows)
        if records:
            cleaned_applicants[date] = records
    admissions[year]=cleaned_applicants
# Flatten the per-term admissions into one spreadsheet row per attorney
terms = []
dates = []
texts = []
for year, admitted_by_date in admissions.items():
    # Two-digit terms below 30 are read as 20xx, all others as 19xx
    century = '20' if int(year) < 30 else '19'
    term = century + year
    for date, attorneys in admitted_by_date.items():
        for attorney in attorneys:
            if attorney == ['', '', '']:
                continue
            # Commas inside a field are removed so that ', ' only separates fields
            text = ', '.join(attorney[k].replace(',', '') for k in range(3))
            texts.append(text)
            dates.append(date)
            terms.append(int(term))

df = pd.DataFrame({
    'docket_number': 'AttB',
    'term': terms,
    'date': dates,
    'action': 'Admission',
    'text': texts,
    'cleaned_docket': 'AttB',
    'action_text': texts,
    'specific_action': texts,
})
df.to_excel('Data/Processed_Data/SC_Bar_Admissions.xlsx', index = False)

